This notebook is using the data from the krill tracks in DLTdv8a and testing them for auto-correlation and cross-correlation with each other. These correlations are then run through a Random Forrest Model to see which experimental conditions affect the fit of the correlation models the most.
The first thing we need to do is load in the data and change the experimental factors into numeric operators so that we can generate a matrix of experimental conditions.
### This is a short list of all of the library packages that are needed to run the code in this notebook.
### If you don't already have these packages installed use install.packages('...') prior to the library call to install the package from Cran-r.
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(hexbin)
## Warning: package 'hexbin' was built under R version 4.1.3
library(diptest)
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 1.1.1 --
## v broom 1.0.5 v rsample 1.2.0
## v dials 1.2.0 v tibble 3.2.1
## v dplyr 1.1.2 v tidyr 1.3.0
## v infer 1.0.5 v tune 1.1.2
## v modeldata 1.2.0 v workflows 1.1.3
## v parsnip 1.1.1 v workflowsets 1.0.1
## v purrr 1.0.2 v yardstick 1.2.0
## v recipes 1.0.8
## Warning: package 'dials' was built under R version 4.1.3
## Warning: package 'scales' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'workflows' was built under R version 4.1.3
## Warning: package 'workflowsets' was built under R version 4.1.3
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x recipes::step() masks stats::step()
## * Use tidymodels_prefer() to resolve common conflicts.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library('matrixStats')
## Warning: package 'matrixStats' was built under R version 4.1.3
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
rm(list=ls(all=TRUE)) ## removes the previous workspace and environment so that we only have the data we need loaded in the session
load("~/Post-doc/Data/Total Merged Data File (Oct 5 2023).RData")
#load('~/Work/Data/Antarctic/CC.TotalData.2022.07.20.RData')
#load('~/Work/Data/Antarctic/TotalMergedDataFile.2023.04.12.RData')
#load('~/Work/Data/Antarctic/Total Merged Data File (June 23 2023).RData')
#load('~/Work/Data/Antarctic/Total Merged Data File (July 24 2023).RData')
#load('~/Work/Data/Antarctic/TotalMergedDataFile.2023.09.20.RData')
#load('~/Work/Data/Antarctic/CC.TotalData.2023.09.20.RData') ## loads the most recent version of the data, after it has been cleaned, processed and transformed into a singular dataframe, that has then been saved as a .Rdata file.
CC.TotalData$Chlorophyll <- as.numeric(as.character(CC.TotalData$Chlorophyll)) ## turns each factor level into a numeric operator
## Warning: NAs introduced by coercion
CC.TotalData$Light <- as.numeric((CC.TotalData$Light)) - 1 ##as above
CC.TotalData$Guano <- as.numeric((CC.TotalData$Guano)) - 1 ##as above
CC.TotalData$Flow.rate <- as.numeric(as.character(CC.TotalData$Flow.rate)) ##as above
CC.TotalData$Flow.Direction <- as.numeric(as.character(CC.TotalData$Flow.Direction)) ##as above
## Warning: NAs introduced by coercion
Using the numeric operators for each experimental factor we now create vectors of each and create a matrix grid of each set of unique experimental conditions. This will also set parameter ranges based on value combinations used in experiments.
# Vectors of all possible conditions combinations
frs <- unique(CC.TotalData$Flow.rate)
chls <- unique(CC.TotalData$Chlorophyll)
guans <- unique(CC.TotalData$Guano)
lights <- unique(CC.TotalData$Light)
#frs <- as.numeric(as.character(unique(CC.TotalData$Flow.rate)))
#chls <- as.numeric(as.character(unique(CC.TotalData$Chlorophyll)))
#guans <- as.numeric(as.character(unique(CC.TotalData$Guano)))
#lights <- as.numeric(as.character(unique(CC.TotalData$Light)))
#guans <- c(1,2)
#lights <- c(1,2)
conditions <- expand.grid(frs,chls,guans,lights) ## creates a grid of all possible condition combinations
The next chunk looks at velocity and how it effects krill swimming behaviour. Firstly we loop through every combination of conditions and bin velocity that has been smoothed by the smoothing spline (See notebook08) into 30-step bins (30 fps = 1 second).
for (i in 1:dim(conditions)[1]) ## start of the loop for each set of conditions
{
velocity <- CC.TotalData$smooth.v[ ## takes the smoothed velocity from the data and creates a vector of it
(CC.TotalData$Flow.rate==conditions[i,1] & ## takes only the smoothed velocity for that set of conditions
CC.TotalData$Chlorophyll==conditions[i,2] & ##as above
as.numeric(CC.TotalData$Guano)==conditions[i,3] & ##as above
as.numeric(CC.TotalData$Light)==conditions[i,4])] ##as above
if (length(velocity) <= 1 | length(velocity[!is.na(velocity)])==0) { ## checks to see if we have velocity data for that set of experimental conditions, if not then it leaves that set of conditions blank (with NA's)
conditions[i,5:9] <- NA } else { # Otherwise if we have a tank experiment...
# Because we're using the smoothed data, we'll average
# the velocities into 30-step (1 second) bins (30 fps = 1 second)
idx <- 1:length(velocity) ## creates a vector of the same length as the velocity vector
bx <- seq(from=0.5, to=length(velocity), by=30) ## creates a sequence of breaks in increments of 30 the length of the velocity vector
velocity <- binMeans(y = velocity, x = idx, bx = bx) ## takes the velocity vector and puts it into the bins we created (bins the data)
# Now we'll do the same thing for headings and bin those too
# first horizontal heading
heading.h <- CC.TotalData$smooth.h.heading[
(CC.TotalData$Flow.rate==conditions[i,1] &
CC.TotalData$Chlorophyll==conditions[i,2] &
as.numeric(CC.TotalData$Guano)==conditions[i,3] &
as.numeric(CC.TotalData$Light)==conditions[i,4])]
heading.h <- binMeans(y=heading.h, x = idx, bx = bx)
# then vertical heading
heading.v <- CC.TotalData$smooth.v.heading[
(CC.TotalData$Flow.rate==conditions[i,1] &
CC.TotalData$Chlorophyll==conditions[i,2] &
as.numeric(CC.TotalData$Guano)==conditions[i,3] &
as.numeric(CC.TotalData$Light)==conditions[i,4])]
heading.v <- binMeans(y= heading.v, x = idx, bx = bx)
# Now we start adding columns on to conditions matrix
# These are the parameters that will be used in the model simulations
# First we need to run a correlation test to generate the autocorrelation coefficient for velocity
c<-cor.test(log10(velocity[1:length(velocity)-1]), ## takes all steps of velocity except last row
log10(velocity[2:length(velocity)])) ## takes all steps of velocity except first row
conditions[i,5] <- c$estimate ## c$estimate here is the correlation value (R) of the fit, need to square it to get R^2 value
# Next we save the coefficients of the autocorrelation (log) linear fit
v0 <- log10(velocity[1:length(velocity)-1]) ## creates a vector of all steps of velocity except last row
v1 <- log10(velocity[2:length(velocity)]) ## creates a vector of all steps of velocity except first row
df <- data.frame(v0,v1) ## turns the vectors into a data frame
fit <- lm(v1 ~ v0 + 1, data = df) ## saves the linear model fit (lm) of the data frame as it's own vector
conditions[i,6] <- fit$coefficients[2] # Saves the velocity lm slope in column 6 of the data frame of the conditions
conditions[i,7] <- fit$coefficients[1] # As above for Intercept
conditions[i,8] <- sd(fit$residuals) # As above for Variance
# Because we know there is bi-modal swimming in some individuals we then performed a dip test to look for bi-modal means in the velocity
# (doesn't work great)
d <- dip.test(velocity)
conditions[i,9] <- d$p.value # saves the P val for the dip test in column 9
#hist(log10(velocity),100, ## plots a histogram of logged velocity with the title as the p-value of the dip test
# main=d$p.value)
## Now we do the same correlation test for each set of headings data and test it against itself and each other
## Horizontal heading and velocity
c <- cor.test(log10(velocity),abs(heading.h)) ## runs the correlation test
plot(log10(velocity),abs(heading.h), ## plots the two values against each other, title as the R value of the correlation fit
main = c$estimate)
## Vertical heading and velocity
c <- cor.test(log10(velocity),abs(heading.v)) ## as above
plot(log10(velocity),abs(heading.v),
main = c$estimate)
## Vertical and Horizontal heading
c <- cor.test(abs(heading.v),abs(heading.h)) ## as above
plot(abs(heading.v),abs(heading.h),
main = c$estimate)
#dview <- data.frame(logvel = log10(velocity), ## making a data frame of each variable to see the interactions
## without having to load the data each time
# heading.h.abs=abs(heading.h),
# heading.v.abs=abs(heading.v))
#ggpairs(dview, ## making a ggpairs plot
##(all combinations of variables against each other with their correlation values)
## works in the console but not when knitting the code together
# title=paste(conditions$flow.rate[i],
# conditions$chloro[i],
# conditions$guano[i],
# conditions$light[i], sep='_'))
# Now we go through same process with heading angles as we did with velocity to save the linear model and it's slope, intercept and residuals
# Note: assuming these params are independent for now
# (correlations are pretty low)
# Horizontal heading
h0 <- (heading.h[1:length(heading.h)-1])
h1 <- (heading.h[2:length(heading.h)])
plot(h0, h1, main = "Heading autocorrel")
df <- data.frame(h0,h1)
fit <- lm(h1 ~ h0 + 1, data = df)
conditions[i,10] <- fit$coefficients[2] # Slope
conditions[i,11] <- fit$coefficients[1] # Intercept
conditions[i,12] <- sd(fit$residuals) # Variance
# Vertical heading
h0 <- (heading.v[1:length(heading.v)-1])
h1 <- (heading.v[2:length(heading.v)])
df <- data.frame(h0,h1)
fit <- lm(h1 ~ h0 + 1, data = df)
conditions[i,13] <- fit$coefficients[2] # Slope
conditions[i,14] <- fit$coefficients[1] # Intercept
conditions[i,15] <- sd(fit$residuals) # Variance
}
}
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## now we save the data frame for the full set of experimental conditions and label each column appropriately
conditions <- setNames(conditions,c(
"flow.rate","chlorophyll","guano","light",
"vel.corr.val","vel.slope","vel.intercept","vel.sigma","dip.test",
"h.slope","h.intercept","h.sigma",
"v.slope","v.intercept","v.sigma"))
Now that we have our auto- and cross-correlation fits, slopes, intercepts and variance we can plot these for each set of experimental conditions and test to see what set of conditions causes the most variance, or the mean to shift the most. To do this we are using a Random Forest Model.
Using random forest to fit the missing values for autocorrelation. This first block is for fitting velocity.
Idata <- which(!is.na(conditions[,5])) # Index of data
Iblank <- which(is.na(conditions[,5])) # Index of blank data
conditions_data <- conditions[Idata,]
conditions_blank <- conditions[Iblank,]
# First do velocity
conditions.rf.vel.slope <- randomForest(vel.slope ~ ., ## creates a vector of the Random Forest Model based on the slope for velocities correlation
data=select(conditions_data,c(1,2,3,4,6)), ## takes the first 4 columns (experimental conditions)
## and the slope for those conditions (column 6)
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.vel.slope) ## plots the random forest model for velocity slope
varImpPlot(conditions.rf.vel.slope) ##
conditions_blank$vel.slope <- predict( ## fills the blank data frame with no conditions with predictions based the Random Forest models fit
conditions.rf.vel.slope,conditions_blank[,1:4])
fit.vel.slope <- predict(conditions.rf.vel.slope,conditions_data[,1:4]) ## predictions based on the actual conditions data
plot(fit.vel.slope,conditions_data$vel.slope) ## plots the 2 sets of predictions against each other
plot(conditions_blank$chlorophyll,conditions_blank$vel.slope) ## plots the predicted velocity slope for each level of chlorophyll
plot(conditions_blank$flow.rate,conditions_blank$vel.slope) ##as above for flow rate
plot(conditions_blank$light,conditions_blank$vel.slope) ##as above for light
plot(conditions_blank$guano,conditions_blank$vel.slope) ##as above for guano
## Now we do the same again for the intercept of the correlation fit for velocity
conditions.rf.vel.intercept <- randomForest(vel.intercept ~ .,
data=select(conditions_data,c(1,2,3,4,7)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.vel.intercept)
varImpPlot(conditions.rf.vel.intercept)
conditions_blank$vel.intercept <- predict(
conditions.rf.vel.intercept,conditions_blank[,1:4])
fit.vel.intercept <- predict(
conditions.rf.vel.intercept,conditions_data[,1:4])
plot(fit.vel.intercept,conditions_data$vel.intercept)
plot(conditions_blank$chlorophyll,conditions_blank$vel.intercept)
plot(conditions_blank$flow.rate,conditions_blank$vel.intercept)
plot(conditions_blank$light,conditions_blank$vel.intercept)
plot(conditions_blank$guano,conditions_blank$vel.intercept)
## Now we do the same again for the variance (sigma) of the correlation fit for velocity
conditions.rf.vel.sigma <- randomForest(vel.sigma ~ .,
data=select(conditions_data,c(1,2,3,4,8)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.vel.sigma)
varImpPlot(conditions.rf.vel.sigma)
conditions_blank$vel.sigma <- predict(
conditions.rf.vel.sigma,conditions_blank[,1:4])
fit.vel.sigma <- predict(conditions.rf.vel.sigma,conditions_data[,1:4])
plot(fit.vel.sigma,conditions_data$vel.sigma)
plot(conditions_blank$chlorophyll,conditions_blank$vel.sigma)
plot(conditions_blank$flow.rate,conditions_blank$vel.sigma)
plot(conditions_blank$light,conditions_blank$vel.sigma)
plot(conditions_blank$guano,conditions_blank$vel.sigma)
The next section does the same things as the block above did for velocity but for horizontal heading (i.e. relative to flow).
# Next do horizontal heading
conditions.rf.h.slope <- randomForest(h.slope ~ .,
data=select(conditions_data,c(1,2,3,4,10)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.h.slope)
varImpPlot(conditions.rf.h.slope)
conditions_blank$h.slope <- predict(
conditions.rf.h.slope,conditions_blank[,1:4])
fit.h.slope <- predict(conditions.rf.h.slope,conditions_data[,1:4])
plot(fit.h.slope,conditions_data$h.slope)
plot(conditions_blank$chlorophyll,conditions_blank$h.slope)
plot(conditions_blank$flow.rate,conditions_blank$h.slope)
plot(conditions_blank$light,conditions_blank$h.slope)
plot(conditions_blank$guano,conditions_blank$h.slope)
conditions.rf.h.intercept <- randomForest(h.intercept ~ .,
data=select(conditions_data,c(1,2,3,4,11)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.h.intercept)
varImpPlot(conditions.rf.h.intercept)
conditions_blank$h.intercept <- predict(
conditions.rf.h.intercept,conditions_blank[,1:4])
fit.h.intercept <- predict(
conditions.rf.h.intercept,conditions_data[,1:4])
plot(fit.h.intercept,conditions_data$h.intercept)
plot(conditions_blank$chlorophyll,conditions_blank$h.intercept)
plot(conditions_blank$flow.rate,conditions_blank$h.intercept)
plot(conditions_blank$light,conditions_blank$h.intercept)
plot(conditions_blank$guano,conditions_blank$h.intercept)
conditions.rf.h.sigma <- randomForest(h.sigma ~ .,
data=select(conditions_data,c(1,2,3,4,12)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.h.sigma)
varImpPlot(conditions.rf.h.sigma)
conditions_blank$h.sigma <- predict(
conditions.rf.h.sigma,conditions_blank[,1:4])
fit.h.sigma <- predict(conditions.rf.h.sigma,conditions_data[,1:4])
plot(fit.h.sigma,conditions_data$h.sigma)
plot(conditions_blank$chlorophyll,conditions_blank$h.sigma)
plot(conditions_blank$flow.rate,conditions_blank$h.sigma)
plot(conditions_blank$light,conditions_blank$h.sigma)
plot(conditions_blank$guano,conditions_blank$h.sigma)
The next section does the same things as the blocks above did for velocity and horizontal heading but for vertical heading (i.e. relative to surface, aka pitch).
# Next do vertical heading
conditions.rf.v.slope <- randomForest(v.slope ~ .,
data=select(conditions_data,c(1,2,3,4,13)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.v.slope)
varImpPlot(conditions.rf.v.slope)
conditions_blank$v.slope <- predict(
conditions.rf.v.slope,conditions_blank[,1:4])
fit.v.slope <- predict(conditions.rf.v.slope,conditions_data[,1:4])
plot(fit.v.slope,conditions_data$v.slope)
plot(conditions_blank$chlorophyll,conditions_blank$v.slope)
plot(conditions_blank$flow.rate,conditions_blank$v.slope)
plot(conditions_blank$light,conditions_blank$v.slope)
plot(conditions_blank$guano,conditions_blank$v.slope)
conditions.rf.v.intercept <- randomForest(v.intercept ~ .,
data=select(conditions_data,c(1,2,3,4,14)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.v.intercept)
varImpPlot(conditions.rf.v.intercept)
conditions_blank$v.intercept <- predict(
conditions.rf.v.intercept,conditions_blank[,1:4])
fit.v.intercept <- predict(
conditions.rf.v.intercept,conditions_data[,1:4])
plot(fit.v.intercept,conditions_data$v.intercept)
plot(conditions_blank$chlorophyll,conditions_blank$v.intercept)
plot(conditions_blank$flow.rate,conditions_blank$v.intercept)
plot(conditions_blank$light,conditions_blank$v.intercept)
plot(conditions_blank$guano,conditions_blank$v.intercept)
conditions.rf.v.sigma <- randomForest(v.sigma ~ .,
data=select(conditions_data,c(1,2,3,4,15)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.v.sigma)
varImpPlot(conditions.rf.v.sigma)
conditions_blank$v.sigma <- predict(
conditions.rf.v.sigma,conditions_blank[,1:4])
fit.v.sigma <- predict(conditions.rf.v.sigma,conditions_data[,1:4])
plot(fit.v.sigma,conditions_data$v.sigma)
plot(conditions_blank$chlorophyll,conditions_blank$v.sigma)
plot(conditions_blank$flow.rate,conditions_blank$v.sigma)
plot(conditions_blank$light,conditions_blank$v.sigma)
plot(conditions_blank$guano,conditions_blank$v.sigma)
Now we save the each of the model outcomes and predictions, fits, slopes, intercepts and variance into a data file so we can call on it for the model simulations.
save(conditions.rf.vel.slope,
conditions.rf.vel.intercept,
conditions.rf.vel.sigma,
conditions.rf.h.slope,
conditions.rf.h.intercept,
conditions.rf.h.sigma,
conditions.rf.v.slope,
conditions.rf.v.intercept,
conditions.rf.v.sigma,
file='notebook13-rf-2023.11.08data.RData') ## this is the file name we are saving it as that the model simulations will call on (See notebook14)